# Load the Netflix titles CSV, convert the `date_added` text column to a
# Date with lubridate's month-day-year parser, then normalise column names.
NetFlix <- clean_names(
  mutate(
    read_csv("/Users/bogi/Downloads/netflix_titles.csv"),
    date_added = mdy(date_added)
  )
)

# Compact overview of every column and its type.
glimpse(NetFlix)
## Rows: 7,787
## Columns: 12
## $ show_id <chr> "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s1…
## $ type <chr> "TV Show", "Movie", "Movie", "Movie", "Movie", "TV Show",…
## $ title <chr> "3%", "7:19", "23:59", "9", "21", "46", "122", "187", "70…
## $ director <chr> NA, "Jorge Michel Grau", "Gilbert Chan", "Shane Acker", "…
## $ cast <chr> "João Miguel, Bianca Comparato, Michel Gomes, Rodolfo Val…
## $ country <chr> "Brazil", "Mexico", "Singapore", "United States", "United…
## $ date_added <date> 2020-08-14, 2016-12-23, 2018-12-20, 2017-11-16, 2020-01-…
## $ release_year <dbl> 2020, 2016, 2011, 2009, 2008, 2016, 2019, 1997, 2019, 200…
## $ rating <chr> "TV-MA", "TV-MA", "R", "PG-13", "PG-13", "TV-MA", "TV-MA"…
## $ duration <chr> "4 Seasons", "93 min", "78 min", "80 min", "123 min", "1 …
## $ listed_in <chr> "International TV Shows, TV Dramas, TV Sci-Fi & Fantasy",…
## $ description <chr> "In a future where the elite inhabit an island paradise f…
# Missing-data diagnostics: first which columns contain NAs, then how the
# missing values co-occur across columns.
NetFlix %>% gg_miss_which()
NetFlix %>% gg_miss_upset()

# Preview of the first rows.
head(NetFlix)
## # A tibble: 6 × 12
## show_id type title director cast country date_added release_year rating
## <chr> <chr> <chr> <chr> <chr> <chr> <date> <dbl> <chr>
## 1 s1 TV Show 3% <NA> João… Brazil 2020-08-14 2020 TV-MA
## 2 s2 Movie 7:19 Jorge Mich… Demi… Mexico 2016-12-23 2016 TV-MA
## 3 s3 Movie 23:59 Gilbert Ch… Tedd… Singap… 2018-12-20 2011 R
## 4 s4 Movie 9 Shane Acker Elij… United… 2017-11-16 2009 PG-13
## 5 s5 Movie 21 Robert Luk… Jim … United… 2020-01-01 2008 PG-13
## 6 s6 TV Show 46 Serdar Akar Erda… Turkey 2017-07-01 2016 TV-MA
## # ℹ 3 more variables: duration <chr>, listed_in <chr>, description <chr>
# Pie chart of the share of Movies vs TV Shows.
# The slice size is driven by the numeric count `n`; the formatted
# percentage string `prop` is used only as the text label. (Mapping the
# character `prop` to `y` would put the bars on a discrete scale, so the
# slice angles would not reflect the actual counts.)
NetFlix %>%
  count(type, sort = TRUE) %>%
  mutate(prop = paste0(round(n / sum(n) * 100, 0), "%")) %>%
  ggplot(aes(x = "", y = n, fill = type)) +
  geom_bar(
    stat = "identity",
    width = 1,
    color = "steelblue",
    size = 1
  ) +
  # Polar coordinates on y turn the single stacked bar into a pie.
  coord_polar("y", start = 0) +
  geom_text(
    aes(label = prop),
    # Centre each label inside its own slice.
    position = position_stack(vjust = 0.5),
    size = 6,
    col = "white",
    fontface = "bold"
  ) +
  scale_fill_manual(values = c("#e41a1c", "#377eb8")) +
  theme_void() +
  labs(
    title = "Are Movies on Netflix more than TV shows?",
    subtitle = "Pie Plot, proportion of Movies to TV shows",
    caption = "Kaggle: Netflix Movies and TV Shows",
    fill = ""
  )
The result shows that there are more Movies than TV shows.
# Difference, in years, between the year of `date_added` and `release_year`.
# Negative values mean the title was added before its stated release year.
NetFlix <- NetFlix %>%
  mutate(year_diff = year(date_added) - release_year)

# Frequency of each year difference, listed in ascending `year_diff` order.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
NetFlix %>%
  count(year_diff, sort = FALSE)
## # A tibble: 75 × 2
## year_diff n
## <dbl> <int>
## 1 -3 1
## 2 -2 1
## 3 -1 10
## 4 0 2825
## 5 1 1485
## 6 2 644
## 7 3 439
## 8 4 336
## 9 5 226
## 10 6 218
## # ℹ 65 more rows
Checking the titles that were added before their release year:
10 items were added 1 year before their release year
1 item was added 2 years before its release year
1 item was added 3 years before its release year
# Interactive table of the titles whose `date_added` year precedes their
# release year, most negative difference first. `dom = "t"` renders the
# table body only (no search box or pagination controls).
NetFlix %>%
  select(-cast, -description) %>%
  filter(year_diff < 0) %>%
  arrange(year_diff) %>%
  datatable(caption = NULL, options = list(dom = "t"))
# Histogram of `year_diff`, skipping rows where it is missing.
NetFlix %>%
  filter(!is.na(year_diff)) %>%
  select(year_diff) %>%
  plot_ly(
    x = ~year_diff,
    type = "histogram",
    # Thin outline so adjacent bins stay distinguishable.
    marker = list(line = list(color = "darkgray", width = 1))
  ) %>%
  layout(
    title = "Year difference between release_year and date_added",
    xaxis = list(title = "difference (Years)", zeroline = FALSE),
    yaxis = list(title = "Count", zeroline = FALSE)
  )
Checking examples of titles with a very large year difference (around 90 years)
# Interactive table of the titles added more than 60 years after their
# release year, largest difference first.
NetFlix %>%
  select(title, type, release_year, date_added, year_diff) %>%
  filter(year_diff > 60) %>%
  arrange(desc(year_diff)) %>%
  datatable(caption = NULL, options = list(dom = "t"))
# Grouped bar chart: number of titles per content type, split by rating.
# Only the 5 most frequent ratings keep their own level; fct_lump()
# collapses the remaining ones into "Other".
NetFlix %>%
  filter(!is.na(rating)) %>%
  mutate(rating = fct_lump(rating, n = 5)) %>%
  # count() returns an ungrouped tibble, so no residual grouping (and no
  # summarise() regrouping message) is carried into plot_ly().
  count(rating, type, name = "Count") %>%
  arrange(Count) %>%
  plot_ly(
    x = ~type,
    y = ~Count,
    type = "bar",
    color = ~rating,
    text = ~Count,
    textposition = "outside",
    textfont = list(color = "#000000", size = 12)
  ) %>%
  layout(
    title = "Rating by Type",
    # x carries the content type and y the counts, so the axis titles must
    # follow that orientation. No category ordering is set on y because it
    # is a numeric axis.
    xaxis = list(title = "Type"),
    yaxis = list(title = "Count"),
    legend = list(title = list(text = "<b> Rating </b>"))
  )
# Horizontal bar chart of the number of titles per `country` value.
#
# data:        data frame with a `country` column.
# n_countries: how many of the most frequent `country` values keep their
#              own bar; fct_lump() collapses the remaining values into
#              "Other".
#
# Returns a plotly object whose bars follow ascending count.
plot_country_counts <- function(data, n_countries) {
  data %>%
    filter(!is.na(country)) %>%
    mutate(country = fct_lump(country, n = n_countries)) %>%
    count(country, name = "Count") %>%
    arrange(Count) %>%
    plot_ly(
      x = ~Count,
      y = ~country,
      type = "bar",
      orientation = "h"
    ) %>%
    layout(
      title = "Items distribution by Country",
      yaxis = list(
        title = "Country",
        # The category order has to be given as category names, not as the
        # counts. The rows are already sorted by Count, so the country
        # column itself is the desired order.
        categoryorder = "array",
        categoryarray = ~ as.character(country)
      ),
      xaxis = list(title = "Count")
    )
}

# Top 10 country values, everything else lumped together.
plot_country_counts(NetFlix, n_countries = 10)

# Wider view: top 45 country values.
plot_country_counts(NetFlix, n_countries = 45)
# Interactive table of the titles whose `country` value is exactly
# "Lebanon" (rows with a missing country are excluded).
NetFlix %>%
  select(-cast, -description) %>%
  filter(!is.na(country), country == "Lebanon") %>%
  datatable(caption = NULL, options = list(dom = "t"))
# Movies only: keep the columns used below, drop rows with any missing
# value among them, and extract the number from the `duration` text
# (e.g. "93 min") as `duration_min`.
movies <- NetFlix %>%
  filter(type == "Movie") %>%
  select(country, type, duration, rating, title) %>%
  drop_na() %>%
  mutate(duration_min = parse_number(duration))

# TV shows only: same preparation, with the number taken from the
# `duration` text (e.g. "4 Seasons") stored as `duration_season`.
tv_show <- NetFlix %>%
  filter(type == "TV Show") %>%
  select(country, type, duration, rating, title) %>%
  drop_na() %>%
  mutate(duration_season = parse_number(duration))
# Histogram of movie running times in minutes (up to 40 bins).
movies %>%
  plot_ly(
    x = ~duration_min,
    type = "histogram",
    nbinsx = 40,
    marker = list(
      # "darkblue" is a valid colour name (the previous "drakblue" was not).
      color = "darkblue",
      line = list(color = "black", width = 1)
    )
  ) %>%
  layout(
    title = "Duration distribution",
    yaxis = list(title = "Count", zeroline = FALSE),
    xaxis = list(title = "Duration (min)", zeroline = FALSE)
  )
# Interactive table of the movies running longer than 200 minutes,
# longest first.
movies %>%
  select(title, duration_min) %>%
  filter(duration_min > 200) %>%
  arrange(desc(duration_min)) %>%
  datatable(caption = NULL, options = list(dom = "t"))
# Column chart of how many TV shows have each number of seasons.
# count() only needs the `duration_season` column, so no prior select()
# is required.
tv_show %>%
  count(duration_season, sort = TRUE) %>%
  ggplot(aes(
    # Treat the season count as a category so every value gets its own bar.
    x = as.factor(duration_season),
    y = n,
    label = n
  )) +
  geom_col(aes(fill = duration_season)) +
  # Print each count just above its bar.
  geom_text(vjust = -0.5, size = 3, col = "darkgreen") +
  theme_light() +
  theme(legend.position = "none") +
  labs(
    x = "Season duration",
    y = "Count",
    title = "Season distribution",
    subtitle = "Column Plot, Season distribution",
    caption = "Kaggle: Netflix Movies and TV Shows",
    fill = ""
  )
# Interactive table of the TV shows with more than 15 seasons, most
# seasons first.
tv_show %>%
  select(title, duration_season) %>%
  filter(duration_season > 15) %>%
  arrange(desc(duration_season)) %>%
  datatable(caption = NULL, options = list(dom = "t"))
# Interactive column + line chart of the number of titles added per year
# (rows without a `date_added` value are excluded).
ggplotly(
  NetFlix %>%
    select(date_added) %>%
    filter(!is.na(date_added)) %>%
    mutate(year_added = year(date_added)) %>%
    group_by(year_added) %>%
    summarise(Count = n()) %>%
    arrange(desc(Count)) %>%
    ggplot(aes(
      x = year_added,
      y = Count,
      label = Count
    )) +
    geom_line(size = 1, col = "darkred", alpha = 0.5) +
    geom_col(alpha = 0.6, fill = "steelblue") +
    geom_text(vjust = -0.7, size = 3) +
    theme_light() +
    # Spell the argument out in full (`labels`) instead of relying on
    # partial argument matching of `label`.
    scale_y_continuous(labels = comma) +
    labs(
      x = "Year Added",
      y = "Count",
      title = "Number of Items added per year",
      subtitle = "Column and line Plot, Number of Items added per year",
      caption = "Kaggle: Netflix Movies and TV Shows"
    )
)
# Word cloud of the words used in Movie descriptions.
# Split each description into one word per row, then remove stop words.
# The join key is stated explicitly so the join does not depend on
# whichever column names the two tables happen to share.
desc_words_m <- NetFlix %>%
  select(type, show_id, description) %>%
  filter(type == "Movie") %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words, by = "word")

# Word frequencies, most frequent first.
count_word <- desc_words_m %>%
  count(word, sort = TRUE)

# Only words occurring at least 50 times are drawn.
wordcloud(
  words = count_word$word,
  freq = count_word$n,
  min.freq = 50,
  max.words = nrow(count_word),
  random.order = FALSE,
  rot.per = 0.1,
  colors = brewer.pal(8, "Dark2")
)
# Word cloud of the words used in TV Show descriptions.
# Split each description into one word per row, then remove stop words.
# The join key is stated explicitly so the join does not depend on
# whichever column names the two tables happen to share.
desc_words_tv <- NetFlix %>%
  select(type, show_id, description) %>%
  filter(type == "TV Show") %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words, by = "word")

# Word frequencies, most frequent first.
count_word <- desc_words_tv %>%
  count(word, sort = TRUE)

# Only words occurring at least 30 times are drawn.
wordcloud(
  words = count_word$word,
  freq = count_word$n,
  min.freq = 30,
  max.words = nrow(count_word),
  random.order = FALSE,
  rot.per = 0.1,
  colors = brewer.pal(8, "Dark2")
)